#   LibAut
#       onset regressions: simple (bivariate)
#       Figure 6
#   Juraj Medzihorsky
#   2021-08-12

library(mgcv)
library(parallel)
options(mc.cores=2)
options(stringsAsFactors=F)
source('helpers.R')

#   sessionInfo()
##  R version 3.6.3 (2020-02-29)
##  Platform: x86_64-pc-linux-gnu (64-bit)
##  Running under: Ubuntu 16.04.7 LTS
##
##  Matrix products: default
##  BLAS/LAPACK: /opt/OpenBLAS/lib/libopenblas_haswellp-r0.3.9.so
##
##  locale:
##   [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_GB.UTF-8       LC_NAME=C
##   [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
##  attached base packages:
##  [1] parallel  stats     graphics  grDevices utils     datasets  methods   base
##
##  other attached packages:
##  [1] mgcv_1.8-28    nlme_3.1-152   nvimcom_0.9-28 colorout_1.2-0
##
##  loaded via a namespace (and not attached):
##  [1] compiler_3.6.3  Matrix_1.3-4    tools_3.6.3     splines_3.6.3   grid_3.6.3      lattice_0.20-44


#------------------------------------------------------------------------------

code_dir <- getwd()
data_dir <- gsub('code$', 'data', code_dir)
figs_dir <- gsub('code$', 'figures', code_dir)
tabs_dir <- gsub('code$', 'tables', code_dir)

setwd(data_dir)
dat <- readRDS('onset_20210810.rds')
eps <- read.csv('eps.csv')
load('onset_regressions_simple_lpo_20210810.rdata') 
setwd(code_dir)
dat <- subset(dat, onset_eligible%in%1)

#------------------------------------------------------------------------------
#   get EP shares
tab_dem <- aggregate(eps$dem_ep, by=list(eps$year), FUN=mean, na.rm=T)
tab_aut <- aggregate(eps$aut_ep, by=list(eps$year), FUN=mean, na.rm=T)
names(tab_dem) <- c('year', 'share_dem_ep')
names(tab_aut) <- c('year', 'share_aut_ep')

tab <- merge(tab_dem, tab_aut, by='year')

tab_lag <- tab
tab_lag$year <- tab$year + 1
names(tab_lag)[2:3] <- paste0('lag1_', names(tab)[2:3])

tab_tab <- merge(tab, tab_lag)

dat <- merge(dat, tab_tab, by='year')

weird <- which(abs(dat$lag1_e_migdpgro)>1)
dat$lag1_e_migdpgro[weird] <- NA

dat$logpop <- log(dat$e_mipopula) 

dat$reg_fac <- as.factor(dat$e_regionpol_6C)

#   with(dat, table(reg_fac, e_regionpol_6C))

reglabs <- c('Eastern Europe & Central Asia',
             'Latin America &the Caribbean',
             'MENA, ISR, TUR',
             'Sub-Saharan Africa',
             'Western Europe, North America, CYP, AUS, NZ',
             'Asia & Pacific')

#------------------------------------------------------------------------------
#   summarize CVs

    cv_auc_d <- sapply(out_d, function(x) mean(x$auc))
    cv_auc_c <- sapply(out_c, function(x) mean(x$auc))
    cv_cc_d <- sapply(out_d, function(x) mean(x$cc))
    cv_cc_c <- sapply(out_c, function(x) mean(x$cc))

#------------------------------------------------------------------------------
#   formulas

fy0 <- 'libaut_start ~ 1'

dummies <- c('reg_fac', 'e_miinteco', 'e_miinterc')
#
conties <- 
    c('e_migdppcln', 'e_migdpgro', 'logpop', 'v2pepwrsoc', 'v2xeg_eqdr',
      'v2xnp_pres', 'v2csprtcpt', 'v2x_polyarchy', 'excl_region_edi', 'year', 
      'share_dem_ep', 'share_aut_ep')  
#
dummies <- c(dummies[1], paste0('lag1_', dummies[2:3]))
conties <- c(paste0('lag1_', conties[1:9]), conties[10],
             paste0('lag1_', conties[11:12]))

clabs <-
    c('ln GDPpc', 'GDPpc growth', 'ln Population in thousands', 'Power distributed by social group',
      'Equal distribution of resources', 'Presidentialism',
      'CSO participatory environment', 'Electoral democracy',
      'Exclusive regional EDI', 'Year', 
      'Countries in\ndemocratization episodes',
      'Countries in\nautocratization episodes')

fd <- sapply(paste0(fy0, ' + ', dummies), as.formula)
fc <- sapply(paste0(fy0, ' + ', 's(', conties, ', bs="gp")'), as.formula)
fc[[9]] <- as.formula(paste0(fy0, ' + ', 's(', conties[9], ', bs="re")'))


#   add fitted values and their ses to a model object
getfitted <- 
    function(x) 
    {
        d <- x$model
        f <- predict(x, newdata=d, se.fit=T, type='response')
        d$fit <- f$fit
        d$se.fit <- f$se.fit
        return(d)
    }

#-------------------------------------------------------------------------------
#   execute fitting

ld <- mclapply(fd, glm, data=dat, family=binomial('logit'))
gc()
lc <- lapply(fc, gam, data=dat, family=binomial('logit'))
gc()

#-------------------------------------------------------------------------------
#   process fitted models

#   get the fitted values
pd <- mclapply(ld, getfitted) 
pc <- mclapply(lc, getfitted) 
#   available subset sizes for each X
nd <- sapply(pd, nrow)
nc <- sapply(pc, nrow)
#   in-sample AUC ROC
aucd <- lapply(pd, function(x) getROC_AUC(x$fit, x$libaut_start))
aucc <- lapply(pc, function(x) getROC_AUC(x$fit, x$libaut_start))
#  
auc_d <- sapply(aucd, function(x) x$auc)
auc_c <- sapply(aucc, function(x) x$auc)

#   round(auc_d, 2)
#   round(auc_c, 2)


#   newdata for discrete Xs
ddat <- list(data.frame(reg_fac=as.factor(1:6)),
             data.frame(lag1_e_miinteco=c(0,1)),
             data.frame(lag1_e_miinterc=c(0,1)))
#   newdata for continuous Xs
newdat <- apply(dat[,conties], 2, function(x) seq(min(x, na.rm=T), max(x, na.rm=T), length.out=101))
newdat <- as.data.frame(newdat)

#   predicted values for the plot
fitd <- lapply(1:3, function(i) predict(ld[[i]], newdata=ddat[[i]], se.fit=T, type='response')) 
fitc <- lapply(lc, function(i) predict(i, newdata=newdat, se.fit=T, type='response'))


#-------------------------------------------------------------------------------
#   the figure

setwd(figs_dir)

pdf('Figure_6.pdf', 7, 8)
par(mfrow=c(3,5), mar=c(3.4,2,0.4,0.6), oma=c(0,1,0,0), las=1, cex=0.5)
yl <- c(0,1)+c(-1,+1)*gr(-7)
xline <- -1*(1-gr(-4))
for (i in 1:12) {
    plot.new()
    plot.window(xlim=range(newdat[,conties[i]]), ylim=yl)
    rug(dat[dat$libaut_start%in%1,conties[i]], side=3, lwd=gr(-4), col=natcol('blue'))
    rug(dat[dat$libaut_start%in%0,conties[i]], side=1, lwd=gr(-4), col=natcol('red'))
    box(lwd=gr(-1), col=grey(0.7))
    abline(h=c(0,1), col=grey(0.7), lty=2, lwd=gr(-2))
    av <- fitc[[i]]$fit
    lo <- pmax(0, av - 2 * fitc[[i]]$se.fit)
    hi <- pmin(1, av + 2 * fitc[[i]]$se.fit)
    tx <- newdat[,conties[i]]
    polygon(c(tx,rev(tx)), c(lo,rev(hi)), col=grey(0.9), border=grey(0.9), lwd=1e-2)
    lines(tx, av, col=grey(0.4))
    a1 <- axTicks(1)
    a2 <- seq(0, 1, by=0.2)
    pu <- par('usr')
    xm <- mean(pu[1:2])
    if (substr(clabs[i],1,2)=='ln') {
        if (clabs[i]=='ln GDPpc') {
            al1 <- c('500', '5000', '50000')  
            a1 <- log(as.numeric(al1))
            al1 <- c('500', '5,000', '50,000')  
        } else if (clabs[i]=='ln Population in thousands') {
            al1 <- c(expression(10^1), expression(10^2), expression(10^3),
                     expression(10^4), expression(10^5), expression(10^6))
            a1 <- log(10^(1:6))
        }
        axis(1, a1, rep('', length(a1)), lwd=0, lwd.ticks=gr(-1), tck=-gr(1)*1e-2, col=grey(0.7))
        axis(1, a1, al1, lwd=0, line=xline)
    } else if (i < 11) {
        axis(1, a1, rep('', length(a1)), lwd=0, lwd.ticks=gr(-1), tck=-gr(1)*1e-2, col=grey(0.7))
        axis(1, lwd=0, line=xline)
    } else if (i >= 11) {
        al1 <- paste0(round(a1*1e2),'%')
        axis(1, a1, rep('', length(a1)), lwd=0, lwd.ticks=gr(-1), tck=-gr(1)*1e-2, col=grey(0.7))
        axis(1, a1, al1, lwd=0, line=xline)
    }
    axis(1, xm, clabs[i], lwd=0, line=ifelse(i%in%c(11,12), xline+2, xline+1))
    if (i %in% c(1,6,11)) {
        axis(2, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=-gr(1)*1e-2, col=grey(0.7))
        axis(2, lwd=0, line=-gr(-1))
    } 
    axis(2, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
    axis(4, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
    ny1 <- sum(pc[[i]]$libaut_start%in%1)
    ny0 <- format(sum(pc[[i]]$libaut_start%in%0), big.mark=',')
    em <- strheight('M')
    text(xm, 1+em*(1-gr(-3)), if(i==1) { bquote(N['Y=1']==.(ny1)) } else { bquote(.(ny1))}, col=natcol('blue'))
    text(xm, 0-em*(1-gr(-3)), if(i==1) { bquote(N['Y=0']==.(ny0)) } else { bquote(.(ny0))}, col=natcol('red'))
    text(xm, 1 - strheight('M', cex=gr(1)), paste0('LPO-AUC = ', round(cv_auc_c[i], 2)), pos=1, offset=0)
}
    plot.new()
    plot.window(xlim=c(0,4), ylim=yl)
    lines(c(gr(-4),2-gr(-4)), c(0,0), col=grey(0.7), lty=2, lwd=gr(-2))
    lines(c(gr(-4),2-gr(-4)), c(1,1), col=grey(0.7), lty=2, lwd=gr(-2))
    lines(c(2+gr(-4),4-gr(-4)), c(0,0), col=grey(0.7), lty=2, lwd=gr(-2))
    lines(c(2+gr(-4),4-gr(-4)), c(1,1), col=grey(0.7), lty=2, lwd=gr(-2))
    rect(gr(-4), pu[3], 2-gr(-4), pu[4], lwd=gr(-1), border=grey(0.7))
    rect(2+gr(-4), pu[3], 4-gr(-4), pu[4], lwd=gr(-1), border=grey(0.7))
    axis(1, c(1,3), c('International', 'Domestic'), lwd=0, line=xline+1)
    axis(1, 2, 'Armed Conflict', lwd=0, line=xline+2)
    td <- dist(par('usr')[1:2])*gr(1)*1e-2
    invisible(sapply(a2, function(j) lines(0+gr(-4)+c(0,td), rep(j, 2), col=grey(0.7), lwd=gr(-1))))
    invisible(sapply(a2, function(j) lines(2+gr(-4)+c(0,td), rep(j, 2), col=grey(0.7), lwd=gr(-1))))
    invisible(sapply(a2, function(j) lines(2-gr(-4)-c(0,td), rep(j, 2), col=grey(0.7), lwd=gr(-1))))
    invisible(sapply(a2, function(j) lines(4-gr(-4)-c(0,td), rep(j, 2), col=grey(0.7), lwd=gr(-1))))
#   axis(2, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
#   axis(4, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
    text(1, 1 - strheight('M', cex=gr(1)), paste0('LPO-AUC\n=\n', round(cv_auc_d[2], 2)), pos=1, offset=0)
    text(3, 1 - strheight('M', cex=gr(1)), paste0('LPO-AUC\n=\n', round(cv_auc_d[3], 2)), pos=1, offset=0)
    text(1, 1, sum(pd[[2]]$libaut_start%in%1), col=natcol('blue'), pos=3, offset=gr(-3))
    text(1, 0, format(sum(pd[[2]]$libaut_start%in%0), big.mark=','), col=natcol('red'),  pos=1, offset=gr(-3))
    text(3, 1, sum(pd[[3]]$libaut_start%in%1), col=natcol('blue'), pos=3, offset=gr(-3))
    text(3, 0, format(sum(pd[[3]]$libaut_start%in%0), big.mark=','), col=natcol('red'),  pos=1, offset=gr(-3))
    lines(c(1,1)-gr(-3),
          c(fitd[[2]]$fit[1]-1.96*fitd[[2]]$se.fit[1],
            fitd[[2]]$fit[1]+1.96*fitd[[2]]$se.fit[1]),
          col=grey(0.4), lend='butt')
    lines(c(1,1)+gr(-3),
          c(fitd[[2]]$fit[2]-1.96*fitd[[2]]$se.fit[2],
            fitd[[2]]$fit[2]+1.96*fitd[[2]]$se.fit[2]),
          col=grey(0.4), lend='butt')
    lines(c(3,3)-gr(-3),
          c(fitd[[3]]$fit[1]-1.96*fitd[[3]]$se.fit[1],
            fitd[[3]]$fit[1]+1.96*fitd[[3]]$se.fit[1]),
          col=grey(0.4), lend='butt')
    lines(c(3,3)+gr(-3),
          c(fitd[[3]]$fit[2]-1.96*fitd[[3]]$se.fit[2],
            fitd[[3]]$fit[2]+1.96*fitd[[3]]$se.fit[2]),
          col=grey(0.4), lend='butt')
    points(1+c(-1,+1)*gr(-3), fitd[[2]]$fit, pch=16, col=grey(0.4))
    points(3+c(-1,+1)*gr(-3), fitd[[3]]$fit, pch=16, col=grey(0.4))
    a1 <- c(1+c(-1,+1)*gr(-3), 3+c(-1,+1)*gr(-3))
    axis(1, a1, rep('', length(a1)), lwd=0, lwd.ticks=gr(-1), tck=-gr(1)*1e-2, col=grey(0.7))
    axis(1, a1, c('0', '1', '0', '1'), lwd=0, line=xline)
#
    plot.new()
    plot.window(xlim=c(0,7)+c(-1,1)*gr(-2), ylim=yl)
    abline(h=c(0,1), col=grey(0.7), lty=2, lwd=gr(-2))
    box(lwd=gr(-1), col=grey(0.7))
    for (i in 1:6) {
        lines(rep(i,2), 
              c(max(0, fitd[[1]]$fit[i]-1.96*fitd[[1]]$se.fit[i]),    
                min(1, fitd[[1]]$fit[i]+1.96*fitd[[1]]$se.fit[i])),
              col=grey(0.4), lend='butt')
    }
    pu <- par('usr')
    xm <- mean(pu[1:2])
    a1 <- 1:6 
    points(a1, fitd[[1]]$fit, pch=16, col=grey(0.4))
    text(xm, 1, sum(pd[[1]]$libaut_start%in%1), col=natcol('blue'), pos=3, offset=gr(-3))
    text(xm, 0, format(sum(pd[[1]]$libaut_start%in%0), big.mark=','), col=natcol('red'),  pos=1, offset=gr(-3))
    text(xm,
         1 - strheight('M', cex=gr(1)), paste0('LPO-AUC = ', round(cv_auc_d[1], 2)), pos=1, offset=0)
    axis(1, xm, 'Region ', lwd=0, line=xline + 1)
    axis(2, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
    axis(4, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
    text(1:6, fitd[[1]]$fit+strheight('M', cex=gr(-1)), reglabs, cex=gr(-1), pos=4, offset=0, srt=90)
    mtext('Predicted probability', side=2, cex=gr(-1), outer=T, las=3)
    mtext('Predicted probability', side=2, cex=gr(-1), outer=T, las=3, at=1/6)
    mtext('Predicted probability', side=2, cex=gr(-1), outer=T, las=3, at=5/6)
dev.off()

#   SCRIPT END